# Import necessary libraries
#Nisha tyagi
!pip install wordcloud Wordcloud
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveRegressor
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
Requirement already satisfied: wordcloud in c:\users\91935\anaconda3\lib\site-packages (1.9.2) Requirement already satisfied: numpy>=1.6.1 in c:\users\91935\anaconda3\lib\site-packages (from wordcloud) (1.24.3) Requirement already satisfied: pillow in c:\users\91935\anaconda3\lib\site-packages (from wordcloud) (9.4.0) Requirement already satisfied: matplotlib in c:\users\91935\anaconda3\lib\site-packages (from wordcloud) (3.7.1) Requirement already satisfied: contourpy>=1.0.1 in c:\users\91935\anaconda3\lib\site-packages (from matplotlib->wordcloud) (1.0.5) Requirement already satisfied: cycler>=0.10 in c:\users\91935\anaconda3\lib\site-packages (from matplotlib->wordcloud) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\91935\anaconda3\lib\site-packages (from matplotlib->wordcloud) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\91935\anaconda3\lib\site-packages (from matplotlib->wordcloud) (1.4.4) Requirement already satisfied: packaging>=20.0 in c:\users\91935\anaconda3\lib\site-packages (from matplotlib->wordcloud) (23.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\91935\anaconda3\lib\site-packages (from matplotlib->wordcloud) (3.0.9) Requirement already satisfied: python-dateutil>=2.7 in c:\users\91935\anaconda3\lib\site-packages (from matplotlib->wordcloud) (2.8.2) Requirement already satisfied: six>=1.5 in c:\users\91935\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.16.0)
# Read the Instagram post metrics from the CSV file, decoding it as ISO-8859-1.
# NOTE(review): the preview shows a caption starting "Heres" with no apostrophe,
# which may mean the file is actually Windows-1252 encoded -- confirm before
# changing the encoding.
data = pd.read_csv(
    'instagram_data.csv',
    encoding='ISO-8859-1',
)
# Wrap the loaded frame in a second DataFrame object and display it.
df = pd.DataFrame(data=data)
df
| | Impressions | From Home | From Hashtags | From Explore | From Other | Saves | Comments | Shares | Likes | Profile Visits | Follows | Caption | Hashtags |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3920 | 2586 | 1028 | 619 | 56 | 98 | 9 | 5 | 162 | 35 | 2 | Here are some of the most important data visua... | #finance #money #business #investing #investme... |
| 1 | 5394 | 2727 | 1838 | 1174 | 78 | 194 | 7 | 14 | 224 | 48 | 10 | Here are some of the best data science project... | #healthcare #health #covid #data #datascience ... |
| 2 | 4021 | 2085 | 1188 | 0 | 533 | 41 | 11 | 1 | 131 | 62 | 12 | Learn how to train a machine learning model an... | #data #datascience #dataanalysis #dataanalytic... |
| 3 | 4528 | 2700 | 621 | 932 | 73 | 172 | 10 | 7 | 213 | 23 | 8 | Heres how you can write a Python program to d... | #python #pythonprogramming #pythonprojects #py... |
| 4 | 2518 | 1704 | 255 | 279 | 37 | 96 | 5 | 4 | 123 | 8 | 0 | Plotting annotations while visualizing your da... | #datavisualization #datascience #data #dataana... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 114 | 13700 | 5185 | 3041 | 5352 | 77 | 573 | 2 | 38 | 373 | 73 | 80 | Here are some of the best data science certifi... | #datascience #datasciencejobs #datasciencetrai... |
| 115 | 5731 | 1923 | 1368 | 2266 | 65 | 135 | 4 | 1 | 148 | 20 | 18 | Clustering is a machine learning technique use... | #machinelearning #machinelearningalgorithms #d... |
| 116 | 4139 | 1133 | 1538 | 1367 | 33 | 36 | 0 | 1 | 92 | 34 | 10 | Clustering music genres is a task of grouping ... | #machinelearning #machinelearningalgorithms #d... |
| 117 | 32695 | 11815 | 3147 | 17414 | 170 | 1095 | 2 | 75 | 549 | 148 | 214 | Here are some of the best data science certifi... | #datascience #datasciencejobs #datasciencetrai... |
| 118 | 36919 | 13473 | 4176 | 16444 | 2547 | 653 | 5 | 26 | 443 | 611 | 228 | 175 Python Projects with Source Code solved an... | #python #pythonprogramming #pythonprojects #py... |
119 rows × 13 columns
# Count the missing entries in each column
data.isna().sum()
Impressions 0 From Home 0 From Hashtags 0 From Explore 0 From Other 0 Saves 0 Comments 0 Shares 0 Likes 0 Profile Visits 0 Follows 0 Caption 0 Hashtags 0 dtype: int64
# Print a summary of the frame: index range, columns, non-null counts, dtypes and memory usage
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 119 entries, 0 to 118 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Impressions 119 non-null int64 1 From Home 119 non-null int64 2 From Hashtags 119 non-null int64 3 From Explore 119 non-null int64 4 From Other 119 non-null int64 5 Saves 119 non-null int64 6 Comments 119 non-null int64 7 Shares 119 non-null int64 8 Likes 119 non-null int64 9 Profile Visits 119 non-null int64 10 Follows 119 non-null int64 11 Caption 119 non-null object 12 Hashtags 119 non-null object dtypes: int64(11), object(2) memory usage: 12.2+ KB
# Plot the distribution of impressions that came from the home feed.
# The style is selected before the figure is created so it applies to the whole figure.
plt.style.use('fivethirtyeight')
plt.figure(figsize=(10, 8))
plt.title("Distribution of Impressions From Home")
# histplot with a KDE overlay on a density axis replaces the deprecated distplot.
sns.histplot(data['From Home'], kde=True, stat='density')
plt.show()
C:\Users\91935\AppData\Local\Temp\ipykernel_2804\2973166774.py:5: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(data['From Home'])
# Plot the distribution of impressions that came from hashtags.
plt.figure(figsize=(10, 8))
plt.title("Distribution of Impressions From Hashtags")
# histplot with a KDE overlay on a density axis replaces the deprecated distplot.
sns.histplot(data['From Hashtags'], kde=True, stat='density')
plt.show()
C:\Users\91935\AppData\Local\Temp\ipykernel_2804\3618955972.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(data['From Hashtags'])
# Plot the distribution of impressions that came from the Explore page.
plt.figure(figsize=(10, 8))
plt.title("Distribution of Impressions From Explore")
# histplot with a KDE overlay on a density axis replaces the deprecated distplot.
sns.histplot(data['From Explore'], kde=True, stat='density')
plt.show()
C:\Users\91935\AppData\Local\Temp\ipykernel_2804\3391790389.py:3: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751 sns.distplot(data['From Explore'])
# Donut chart of the total impressions contributed by each source
source_columns = ("From Home", "From Hashtags", "From Explore", "From Other")
# One column total per source, unpacked in the same order as source_columns.
home, hashtags, explore, other = (data[column].sum() for column in source_columns)
labels = ['From Home', 'From Hashtags', 'From Explore', 'Other']
values = [home, hashtags, explore, other]
fig = px.pie(
    data_frame=data,
    names=labels,
    values=values,
    hole=0.5,
    title="Impressions on Instagram Posts From Various Sources",
)
fig.show()
# Word cloud built from the text of every caption
text = " ".join(data["Caption"])
stopwords = set(STOPWORDS)
wordcloud = WordCloud(background_color="white", stopwords=stopwords)
wordcloud.generate(text)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
# Word cloud built from the hashtags of every post, drawn on a 12x10 figure
text_hashtags = " ".join(data["Hashtags"])
stopwords_hashtags = set(STOPWORDS)
wordcloud_hashtags = WordCloud(background_color="white", stopwords=stopwords_hashtags)
wordcloud_hashtags.generate(text_hashtags)
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud_hashtags, interpolation="bilinear")
plt.axis("off")
plt.show()
# Scatter plots, each with an OLS trendline, of one engagement metric against total impressions.
# Every entry pairs the metric column (used for both the y axis and the marker size) with its title.
impression_plots = [
    ("Likes", "Relationship Between Likes and Total Impressions"),
    ("Comments", "Relationship Between Comments and Total Impressions"),
    ("Shares", "Relationship Between Shares and Total Impressions"),
    ("Saves", "Relationship Between Post Saves and Total Impressions"),
]
for metric, plot_title in impression_plots:
    fig = px.scatter(
        data_frame=data,
        x="Impressions",
        y=metric,
        size=metric,
        trendline="ols",
        title=plot_title,
    )
    fig.show()
# Correlation of every numeric column with Impressions, strongest first.
# numeric_only=True leaves out the text columns (Caption, Hashtags); without it
# pandas warns about the changing default and newer versions fail on the strings.
correlation = data.corr(numeric_only=True)
print(correlation["Impressions"].sort_values(ascending=False))
Impressions 1.000000 From Explore 0.893607 Follows 0.889363 Likes 0.849835 From Home 0.844698 Saves 0.779231 Profile Visits 0.760981 Shares 0.634675 From Other 0.592960 From Hashtags 0.560760 Comments -0.028524 Name: Impressions, dtype: float64
C:\Users\91935\AppData\Local\Temp\ipykernel_2804\381329730.py:2: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
# Conversion rate: total follows as a percentage of total profile visits
total_follows = data["Follows"].sum()
total_profile_visits = data["Profile Visits"].sum()
conversion_rate = (total_follows / total_profile_visits) * 100
print(conversion_rate)
41.00265604249668
# Scatter plot with an OLS trendline of follows against profile visits;
# marker size also tracks the number of follows.
fig = px.scatter(
    data_frame=data,
    x="Profile Visits",
    y="Follows",
    size="Follows",
    trendline="ols",
    title="Relationship Between Profile Visits and Followers Gained",
)
fig.show()
# Train a PassiveAggressiveRegressor to predict Impressions from six engagement metrics.
# The column order here is the feature order the fitted model expects at prediction time.
feature_columns = ['Likes', 'Saves', 'Comments', 'Shares', 'Profile Visits', 'Follows']
x = np.array(data[feature_columns])
y = np.array(data["Impressions"])
# Hold out 20% of the posts for evaluation; the split is seeded.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)
# Seed the regressor as well: it shuffles the training data while fitting, so
# without a fixed random_state the fitted model and its score change on every run.
model = PassiveAggressiveRegressor(random_state=42)
model.fit(xtrain, ytrain)
# Evaluate on the held-out posts.
score = model.score(xtest, ytest)
print(score)
0.8544639282017199
# Predict impressions for a single post described by its engagement metrics.
# Values follow the training column order:
# Likes, Saves, Comments, Shares, Profile Visits, Follows
sample_post = [282.0, 233.0, 4.0, 9.0, 165.0, 54.0]
features = np.array([sample_post])  # 2-D input: one row, six features
prediction = model.predict(features)
print(prediction)
[10320.43240352]